import argparse
import sys
sys.path.append("../FinRL")
import gym
import scipy.optimize
import pandas as pd

import torch
from models import *
from replay_memory import Memory
from running_state import ZFilter
from torch.autograd import Variable
from utils import *
from trpo import one_step_trpo,conjugate_gradients,trpo_step
from finrl.meta.preprocessor.yahoodownloader import YahooDownloader
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl.meta.env_stock_trading.env_stocktrading import StockTradingEnv
from finrl.agents.stablebaselines3.models import DRLAgent
from stable_baselines3.common.logger import configure
from finrl.meta.data_processor import DataProcessor

from finrl.plot import backtest_stats, backtest_plot, get_daily_return, get_baseline
from pprint import pprint
import itertools
from finrl import config
from finrl import config_tickers
import os
from finrl.main import check_and_make_directories
from finrl.config import (
    DATA_SAVE_DIR,
    TRAINED_MODEL_DIR,
    TENSORBOARD_LOG_DIR,
    RESULTS_DIR,
    INDICATORS,
)
check_and_make_directories([DATA_SAVE_DIR, TRAINED_MODEL_DIR, TENSORBOARD_LOG_DIR, RESULTS_DIR])

import csv, random

import pickle

from copy import deepcopy

def setup_training_env():
    """Build the stock-trading training environment from Yahoo price data.

    Price data for the Dow-30 ticker list is downloaded and passed through
    ``FeatureEngineer`` (technical indicators, VIX and turbulence enabled).
    The processed frame is expanded to every (calendar day, ticker) pair,
    restricted to dates that actually occur in the processed data, sorted,
    and missing values are replaced by 0. The frame returned by
    ``data_split`` for 2021-01-01 .. 2022-01-01 is then wrapped in a
    ``StockTradingEnv``.

    Returns:
        tuple: ``(env_train, train)`` where ``env_train`` is the first item
        returned by ``StockTradingEnv.get_sb_env()`` and ``train`` is the
        DataFrame the environment was built from.
    """
    download_start = '2010-01-01'
    download_end = '2023-03-01'
    split_start = '2021-01-01'
    split_end = '2022-01-01'

    downloader = YahooDownloader(start_date = download_start, end_date = download_end, ticker_list = config_tickers.DOW_30_TICKER)
    raw_prices = downloader.fetch_data()
    engineer = FeatureEngineer(use_technical_indicator=True, tech_indicator_list = INDICATORS, use_vix=True, use_turbulence=True, user_defined_feature = False)
    processed = engineer.preprocess_data(raw_prices)

    # Cartesian product of every calendar day in range with every ticker.
    tickers = processed["tic"].unique().tolist()
    calendar_days = list(pd.date_range(processed['date'].min(),processed['date'].max()).astype(str))
    day_ticker_grid = pd.DataFrame(list(itertools.product(calendar_days,tickers)),columns=["date","tic"])

    processed_full = day_ticker_grid.merge(processed,on=["date","tic"],how="left")
    # Drop the calendar days that never appear in the processed data.
    seen_dates = processed_full['date'].isin(processed['date'])
    processed_full = processed_full[seen_dates].sort_values(['date','tic'])
    processed_full = processed_full.fillna(0)

    train = data_split(processed_full, split_start, split_end)

    stock_dimension = len(train.tic.unique())
    # One leading entry, two entries per stock, plus one entry per (indicator, stock) pair.
    state_space = 1 + 2*stock_dimension + len(INDICATORS)*stock_dimension
    # Buy and sell costs deliberately share one list object, as before.
    cost_pct = [0.001] * stock_dimension
    env_kwargs = {
        "hmax": 100,
        "initial_amount": 10000,
        "num_stock_shares": [0] * stock_dimension,
        "buy_cost_pct": cost_pct,
        "sell_cost_pct": cost_pct,
        "state_space": state_space,
        "stock_dim": stock_dimension,
        "tech_indicator_list": INDICATORS,
        "action_space": stock_dimension,
        "reward_scaling": 1
    }
    env_train, _ = StockTradingEnv(df = train, **env_kwargs).get_sb_env()
    return env_train, train

# Turn on PyTorch's backward-compatibility warnings for broadcasting and
# keepdim behaviour.
torch.utils.backcompat.broadcast_warning.enabled = True
torch.utils.backcompat.keepdim_warning.enabled = True
# Make float64 the default floating-point dtype for newly created tensors.
# torch.set_default_tensor_type('torch.DoubleTensor') did the same but is
# deprecated in recent PyTorch releases; set_default_dtype is the supported
# replacement for CPU tensors.
torch.set_default_dtype(torch.float64)

# Command-line options for the training script. The help texts state the
# actual defaults (the --damping, --seed and --max-length help strings
# previously advertised values that did not match ``default=``).
parser = argparse.ArgumentParser(description='PyTorch actor-critic example')
parser.add_argument('--gamma', type=float, default=0.995, metavar='G',
                    help='discount factor (default: 0.995)')
parser.add_argument('--env-name', default="StockMarket", metavar='G',
                    help='name of the environment to run')
parser.add_argument('--tau', type=float, default=0.97, metavar='G',
                    help='gae (default: 0.97)')
parser.add_argument('--meta-reg', type=float, default=0.001, metavar='G',
                    help='meta regularization regression (default: 0.001)')
parser.add_argument('--meta-lambda', type=float, default=0.5, metavar='G',
                    help='meta meta-lambda (default: 0.5)')
parser.add_argument('--max-kl', type=float, default=1e-2, metavar='G',
                    help='max kl value (default: 1e-2)')
parser.add_argument('--damping', type=float, default=0e-5, metavar='G',
                    help='damping (default: 0e-5)')
parser.add_argument('--seed', type=int, default=543, metavar='N',
                    help='random seed (default: 543)')
parser.add_argument('--batch-size', type=int, default=20, metavar='N',
                    help='batch-size (default: 20)')
parser.add_argument('--task-batch-size', type=int, default=5, metavar='N',
                    help='task-batch-size (default: 5)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
parser.add_argument('--log-interval', type=int, default=1, metavar='N',
                    help='interval between training status logs (default: 1)')
parser.add_argument('--index', type=int, default=1, metavar='N',
                    help='index (default: 1)')
parser.add_argument('--max-length', type=int, default=252, metavar='N',
                    help='max length of a path (default: 252)')
parser.add_argument('--lower-opt', type=str, default="Adam", metavar='N',
                    help='lower-opt (default: Adam)')
# Parsed once at import time; the rest of the module reads ``args`` directly.
args = parser.parse_args()

# Seed PyTorch's random number generator from --seed.
torch.manual_seed(args.seed)
#if args.env_name=="HalfCheetah-v4":
#    env = gym.make(args.env_name,exclude_current_positions_from_observation=False)
#else:
#    env = gym.make(args.env_name)
env, train_dataset = setup_training_env()
num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

index = args.index

# Translate --lower-opt into the optimizer tag used in checkpoint file names;
# any unrecognised value falls back to "Adam".
model_lower = {
    "Adam": "Adam",
    "adagrad": "Adagrad",
    "rmsprop": "RMSprop",
    "sgd": "SGD",
}.get(args.lower_opt, "Adam")

# Start from a fresh observation filter, then replace it with the pickled
# one when a checkpoint for this environment/optimizer pair already exists.
# NOTE(review): unpickling executes arbitrary code -- only load checkpoint
# files that this script wrote itself.
running_state = ZFilter((num_inputs,), clip=5)
running_state_path = "./check_point/"+str(args.env_name)+"_running_state_"+model_lower+".pkl"
if os.path.exists(running_state_path):
    with open(running_state_path,'rb') as file:
        running_state = pickle.load(file)

print(model_lower, "running_state: ",running_state.rs.n)
print("index: ", index)

def select_action(state,policy_net):
    """Sample an action from the Gaussian produced by ``policy_net``.

    Args:
        state: NumPy array; a leading axis of size 1 is added before it is
            fed to the network.
        policy_net: callable returning a 3-tuple whose first item is the
            action mean and whose third item is the action standard
            deviation (the second item is ignored here).

    Returns:
        Tensor drawn from ``Normal(mean, std)``.
    """
    batched_state = Variable(torch.from_numpy(state).unsqueeze(0))
    mean, _, std = policy_net(batched_state)
    return torch.normal(mean, std)

def select_action_test(state,policy_net):
    """Return the deterministic (mean) action of ``policy_net`` for ``state``.

    Args:
        state: NumPy array; a leading axis of size 1 is added before it is
            fed to the network.
        policy_net: callable returning a 3-tuple whose first item is the
            action mean; the remaining two items are ignored.

    Returns:
        The action-mean tensor produced by the network.
    """
    batched_state = Variable(torch.from_numpy(state).unsqueeze(0))
    mean, _unused_log_std, _unused_std = policy_net(batched_state)
    return mean

def sample_data_for_task_specific(dates,policy_net,batch_size):
    """Collect ``batch_size`` two-segment rollouts of ``policy_net``.

    Every episode resets ``env`` and then runs two consecutive segments of
    at most ``args.max_length`` steps each. The second segment continues
    from the state where the first one stopped (no reset in between; only
    ``env._elapsed_steps`` is set back to 0). Transitions of the first
    segment go to one ``Memory``, those of the second segment to another.

    On steps whose segment-local index is in ``dates`` the reward is reduced
    by the sum of the positive entries of the action. Note that the step
    index restarts at 0 for the second segment.

    Args:
        dates: collection of step indices on which the penalty applies.
        policy_net: policy passed to ``select_action``.
        batch_size: number of episodes to roll out.

    Returns:
        tuple: ``(batch, batch_extra, mean_reward)`` -- ``Memory.sample()``
        of the first and second segments, and the first-segment reward sum
        averaged over the episodes.
    """
    memory = Memory()
    memory_extra = Memory()

    def run_segment(start_state, episode_idx, buffer):
        """Roll out one segment into ``buffer``; return (last state, reward sum)."""
        current = start_state
        segment_reward = 0
        for step in range(args.max_length):
            act = select_action(current,policy_net).data[0].numpy()
            # NOTE(review): the 4-way unpacking assumes env.step returns
            # (obs, reward, done, <flag>). If the 4th item is actually a
            # non-empty info list it is always truthy and the segment ends
            # after one step -- confirm against the env wrapper in use.
            following, reward, done, truncated = env.step(act)
            if step in dates:
                # NOTE(review): `a > 0` presumes iterating `act` yields
                # scalars -- confirm the action's shape.
                reward = reward-sum(a for a in act if a > 0)
            reward = reward[0]
            segment_reward += reward
            following = running_state(following)

            buffer.push(current, np.array([act]), episode_idx, following, reward)
            if args.render:
                env.render()
            current = following
            if done or truncated:
                break
        return current, segment_reward

    total_reward = 0
    episodes_done = 0
    for episode_idx in range(batch_size):
        first_state = running_state(env.reset())
        last_state, episode_reward = run_segment(first_state, episode_idx, memory)

        # Continue from where the first segment stopped; only the step
        # counter attribute on the env is reset.
        env._elapsed_steps=0
        run_segment(last_state, episode_idx, memory_extra)

        episodes_done += 1
        total_reward += episode_reward

    total_reward /= episodes_done
    batch = memory.sample()
    batch_extra = memory_extra.sample()

    return batch,batch_extra,total_reward


def compute_adavatage(batch,batch_extra,batch_size,gamma=None):
    """Compute discounted returns-to-go for every transition in ``batch``.

    Despite its (historical, misspelled) name this returns plain discounted
    returns, not baseline-subtracted advantages; callers subtract the mean
    themselves.

    ``batch_extra`` holds the continuation of each trajectory. Its rewards
    are folded back-to-front into one running return per trajectory, and
    that value bootstraps the returns of the transitions in ``batch``.

    Compared to the previous implementation, the running return is indexed
    directly by each transition's path number instead of by an
    ``assert``-guarded counter. The result is the same whenever the old
    assertion held, and trajectories without continuation steps no longer
    fail (or get misattributed under ``python -O``). Unused tensor
    conversions of states/actions were removed.

    Args:
        batch: object with ``reward`` and ``path_number`` sequences, ordered
            so that the steps of one trajectory appear in time order.
        batch_extra: same layout, for the continuation segments.
        batch_size: number of trajectories; path numbers must lie in
            ``range(batch_size)``.
        gamma: discount factor; defaults to ``args.gamma`` when omitted.

    Returns:
        Tensor of shape ``(len(batch.reward), 1)`` with the discounted
        return of each transition.
    """
    if gamma is None:
        gamma = args.gamma

    rewards = torch.Tensor(np.array(batch.reward))
    rewards_extra = torch.Tensor(np.array(batch_extra.reward))
    path_ids = [int(p) for p in batch.path_number]
    path_ids_extra = [int(p) for p in batch_extra.path_number]

    # One running discounted return per trajectory, built from the end.
    prev_return = torch.zeros(batch_size,1)
    for i in reversed(range(rewards_extra.size(0))):
        k = path_ids_extra[i]
        prev_return[k,0] = rewards_extra[i] + gamma * prev_return[k,0]

    # Every row is written in the loop below, so zero-initialising is safe.
    returns = torch.zeros(rewards.size(0),1)
    for i in reversed(range(rewards.size(0))):
        k = path_ids[i]
        returns[i] = rewards[i] + gamma * prev_return[k,0]
        prev_return[k,0] = returns[i, 0]

    targets = Variable(returns)
    return targets

def task_specific_adaptation(task_specific_policy,meta_policy_net_copy,batch,q_values,meta_lambda_now,index):
    """Adapt ``task_specific_policy`` on ``batch`` with one regularised step.

    A surrogate loss and a distance-to-meta-policy term are built as
    closures and handed to ``one_step_trpo`` together with
    ``meta_lambda_now`` and ``args.lower_opt``. ``index`` selects the
    distance term:

    * ``1`` -- per-state Gaussian KL(meta || task), summed over action dims;
    * ``2`` -- per-state Gaussian KL(task || meta), summed over action dims;
    * ``3`` -- squared Euclidean distance between the two parameter sets.

    Any other ``index`` leaves the policy untouched.

    Args:
        task_specific_policy: policy to adapt (modified by ``one_step_trpo``).
        meta_policy_net_copy: reference policy; its outputs are detached.
        batch: object with ``action`` and ``state`` sequences.
        q_values: per-transition weights for the surrogate loss.
        meta_lambda_now: regularisation weight forwarded to ``one_step_trpo``.
        index: distance selector, see above.

    Returns:
        ``task_specific_policy`` (the same object that was passed in).
    """
    actions = torch.Tensor(np.concatenate(batch.action, 0))
    states = torch.Tensor(np.array(batch.state))

    # Log-probabilities of the sampled actions under the reference policy;
    # constant denominator of the importance ratio below.
    action_means, action_log_stds, action_stds = meta_policy_net_copy(Variable(states))
    fixed_log_prob = normal_log_density(Variable(actions), action_means, action_log_stds, action_stds).detach().clone().data

    def get_loss():
        action_means1, action_log_stds1, action_stds1 = task_specific_policy(Variable(states))
        log_prob = normal_log_density(Variable(actions), action_means1, action_log_stds1, action_stds1)
        ratio = torch.exp(log_prob - Variable(fixed_log_prob))
        # 2*sigmoid(2*ratio - 2) is 1 with slope 1 at ratio == 1 and stays
        # inside (0, 2): a smoothly bounded replacement for the raw ratio.
        action_loss = -Variable(q_values) *  torch.special.expit(2.0*ratio-2.0)*2
        return action_loss.mean()

    def _gaussian_outputs():
        """Forward both policies; the reference outputs are detached doubles."""
        mean1, log_std1, std1 = task_specific_policy(Variable(states))
        mean_previous, log_std_previous, std_previous = meta_policy_net_copy(Variable(states))
        mean0 = mean_previous.detach().clone().double()
        log_std0 = log_std_previous.detach().clone().double()
        std0 = std_previous.detach().clone().double()
        return (mean1, log_std1, std1), (mean0, log_std0, std0)

    def get_kl():
        (mean1, log_std1, std1), (mean0, log_std0, std0) = _gaussian_outputs()
        kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (2.0 * std1.pow(2)) - 0.5
        return kl.sum(1, keepdim=True)

    def get_kl2():
        (mean1, log_std1, std1), (mean0, log_std0, std0) = _gaussian_outputs()
        kl = log_std0 - log_std1 + (std1.pow(2) + (mean1 - mean0).pow(2)) / (2.0 * std0.pow(2)) - 0.5
        return kl.sum(1, keepdim=True)

    def get_kl3():
        policy_dictance=torch.tensor(0.0)
        # Pair the parameters with zip instead of rebuilding the reference
        # parameter list on every iteration.
        for param, meta_param in zip(task_specific_policy.parameters(), meta_policy_net_copy.parameters()):
            policy_dictance += (param-meta_param.detach()).pow(2).sum()
        return policy_dictance

    distance_terms = {1: get_kl, 2: get_kl2, 3: get_kl3}
    distance_fn = distance_terms.get(index)
    if distance_fn is not None:
        one_step_trpo(task_specific_policy, get_loss, distance_fn,meta_lambda_now,args.lower_opt)

    return task_specific_policy

def kl_divergence(meta_policy_net1,task_specific_policy1,batch,index):
    """Measure how far the task policy is from the meta policy.

    Args:
        meta_policy_net1: reference policy (outputs labelled ``0`` below).
        task_specific_policy1: adapted policy (outputs labelled ``1``).
        batch: object whose ``state`` sequence is evaluated (unused when
            ``index == 3``).
        index: ``1`` for Gaussian KL(meta || task), ``2`` for
            KL(task || meta) -- both summed over action dims and averaged
            over states -- and ``3`` for the squared Euclidean distance
            between the parameters.

    Returns:
        Scalar tensor, or ``None`` for any other ``index``.
    """
    if index == 3:
        policy_dictance = torch.tensor(0.0)
        param_pairs = zip(task_specific_policy1.parameters(), meta_policy_net1.parameters())
        for task_param, meta_param in param_pairs:
            policy_dictance += (task_param - meta_param).pow(2).sum()
        return policy_dictance

    if index not in (1, 2):
        return None

    states = Variable(torch.Tensor(np.array(batch.state)))
    mean1, log_std1, std1 = task_specific_policy1(states)
    mean0, log_std0, std0 = meta_policy_net1(states)
    if index == 1:
        per_dim = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (2.0 * std1.pow(2)) - 0.5
    else:
        per_dim = log_std0 - log_std1 + (std1.pow(2) + (mean1 - mean0).pow(2)) / (2.0 * std0.pow(2)) - 0.5
    return per_dim.sum(1, keepdim=True).mean()

def policy_gradient_obain(task_specific_policy,after_batch,after_q_values):
    """Return the surrogate loss of the adapted policy and its gradient.

    The importance ratio is taken between the policy and a detached copy of
    its own log-probabilities, so the ratio equals 1 in value and the
    gradient of the loss is the ordinary policy gradient weighted by
    ``after_q_values``.

    Existing gradients are cleared first. Parameters whose ``.grad`` is
    still ``None`` (no earlier backward pass, or an optimizer that resets
    gradients to ``None``) are skipped instead of raising
    ``AttributeError``; a parameter that receives no gradient from the
    backward pass contributes a zero tensor.

    Args:
        task_specific_policy: policy network; its ``.grad`` fields are
            overwritten.
        after_batch: object with ``action`` and ``state`` sequences.
        after_q_values: per-transition weights.

    Returns:
        tuple: ``(J_loss, policy_grad)`` -- the scalar loss tensor and a
        list with one gradient tensor per parameter, in ``parameters()``
        order.
    """
    actions = torch.Tensor(np.array(np.concatenate(after_batch.action, 0)))
    states = torch.Tensor(np.array(after_batch.state))

    # Detached reference log-probabilities; no graph is needed for them.
    with torch.no_grad():
        fixed_action_means, fixed_action_log_stds, fixed_action_stds = task_specific_policy(Variable(states))
        fixed_log_prob = normal_log_density(Variable(actions), fixed_action_means, fixed_action_log_stds, fixed_action_stds).detach().clone()

    afteradap_action_means, afteradap_action_log_stds, afteradap_action_stds = task_specific_policy(Variable(states))
    log_prob = normal_log_density(Variable(actions), afteradap_action_means, afteradap_action_log_stds, afteradap_action_stds)
    ratio = torch.exp(log_prob - Variable(fixed_log_prob))
    weighted = Variable(after_q_values)*ratio

    J_loss = (-weighted).mean()
    for param in task_specific_policy.parameters():
        if param.grad is not None:
            param.grad.zero_()
    J_loss.backward(retain_graph=False)
    policy_grad = [
        param2.grad.detach().clone() if param2.grad is not None else torch.zeros_like(param2)
        for param2 in task_specific_policy.parameters()
    ]

    return J_loss, policy_grad

def loss_obain_new(task_specific_policy,meta_policy_net_copy,after_batch,after_q_values):
    """Surrogate loss of the task policy relative to a reference policy.

    The importance ratio ``exp(log pi_task - log pi_ref)`` is passed through
    ``2 * sigmoid(2 * ratio - 2)`` and weighted by ``-after_q_values``; the
    mean over all transitions is returned. The reference log-probabilities
    are detached, so gradients flow through the task policy only.

    Args:
        task_specific_policy: policy whose loss is evaluated.
        meta_policy_net_copy: reference policy for the ratio's denominator.
        after_batch: object with ``action`` and ``state`` sequences.
        after_q_values: per-transition weights.

    Returns:
        Scalar loss tensor.
    """
    sampled_actions = Variable(torch.Tensor(np.array(np.concatenate(after_batch.action, 0))))
    visited_states = Variable(torch.Tensor(np.array(after_batch.state)))

    ref_means, ref_log_stds, ref_stds = meta_policy_net_copy(visited_states)
    reference_log_prob = normal_log_density(sampled_actions, ref_means, ref_log_stds, ref_stds).detach().data.clone()

    task_means, task_log_stds, task_stds = task_specific_policy(visited_states)
    task_log_prob = normal_log_density(sampled_actions, task_means, task_log_stds, task_stds)

    ratio = torch.exp(task_log_prob - Variable(reference_log_prob))
    weighted = -Variable(after_q_values) * torch.special.expit(2.0*ratio-2.0)
    return (weighted*2).mean()

def turbulence_dates(turbulence_threshold, dataset=None):
    """Return the sorted index labels whose turbulence reaches the threshold.

    Rows sharing an index label are treated as one entry and the first
    row's ``turbulence`` value represents it (the previous implementation
    read ``[i].tolist()[0]`` in the same way).
    presumably the index is the per-day counter produced by ``data_split``,
    with one row per ticker -- TODO confirm.

    The previous loop ran over ``range(nunique(turbulence))``, i.e. the
    number of distinct turbulence *values*, so trailing entries were
    silently skipped whenever two entries shared a value. Every index label
    is now examined.

    Args:
        turbulence_threshold: labels with turbulence ``>=`` this value are
            returned.
        dataset: DataFrame with a ``turbulence`` column; defaults to the
            module-level ``train_dataset``.

    Returns:
        Sorted list of index labels.
    """
    if dataset is None:
        dataset = train_dataset
    turbulence = dataset['turbulence']
    # Keep the first row of every index label.
    per_label = turbulence[~turbulence.index.duplicated(keep='first')]
    selected = per_label[per_label >= turbulence_threshold]
    return sorted(set(selected.index.tolist()))

if __name__ == "__main__":
    # Meta-training driver: for each meta-iteration, adapt a copy of the
    # meta policy on several sampled tasks, solve a linear system per task
    # for the implicit meta-gradient, take one TRPO step on the meta
    # policy, then evaluate on a fixed set of thresholds and checkpoint the
    # best model.

    # Checkpoints and the CSV log are written here; create the directory up
    # front so the first write cannot fail on a fresh checkout.
    os.makedirs("./check_point", exist_ok=True)
    meta_policy_ckpt = "./check_point/"+str(args.env_name)+"_meta_policy_net_"+model_lower+".pkl"
    running_state_ckpt = "./check_point/"+str(args.env_name)+"_running_state_"+model_lower+".pkl"

    # np.loadtxt yields a 0-d array for a single-value file, which
    # np.random.choice rejects; force a 1-D array.
    turbulence_threshold_set = np.atleast_1d(np.loadtxt('training_tasks.txt'))
    if not os.path.exists(meta_policy_ckpt):
        meta_policy_net = Policy(num_inputs, num_actions)
    else:
        # NOTE(review): this unpickles a whole module object -- only load
        # trusted checkpoints. Recent PyTorch releases default torch.load to
        # weights_only=True, which rejects such files; confirm against the
        # installed version.
        meta_policy_net = torch.load(meta_policy_ckpt)

    def clone_meta_policy():
        """Return a new Policy whose parameters are copies of meta_policy_net's."""
        clone = Policy(num_inputs, num_actions)
        for target, source in zip(clone.parameters(), meta_policy_net.parameters()):
            target.data.copy_(source.detach())
        return clone

    # ---- warm-up: feed observations through running_state ----
    # NOTE(review): `state` is never advanced to `next_state` and `done` is
    # ignored, so every action is sampled for the reset state. Left as is
    # because changing it alters the filter statistics -- confirm intent.
    for i in range(args.batch_size*5):
        state = env.reset()
        state = running_state(state)
        for t in range(args.max_length):
            action = select_action(state,meta_policy_net)
            action = action.data[0].numpy()
            next_state, reward, done, truncated = env.step(action)
            next_state = running_state(next_state)

    # Best mean post-adaptation evaluation reward seen so far.
    best_result_after=-10000

    for i_episode in range(500):
        print("i_episode: ",i_episode)
        meta_lambda_now=args.meta_lambda
        print("meta_lambda_now: ",meta_lambda_now)

        x_list=[]
        task_specific_policy_list=[]
        batch_list=[]

        for task_number in range(args.task_batch_size):
            turbulence_threshold=np.random.choice(turbulence_threshold_set)
            dates=turbulence_dates(turbulence_threshold)
            batch,batch_extra,accumulated_raward_batch=sample_data_for_task_specific(dates,meta_policy_net,args.batch_size)
            print('i_episode',i_episode)
            print('accumulated_reward_batch',accumulated_raward_batch)
            print('(before adaptation) Episode {}\tAverage reward {:.2f}'.format(i_episode, accumulated_raward_batch))

            q_values = compute_adavatage(batch,batch_extra,args.batch_size)
            q_values1 = (q_values - q_values.mean())

            # Both networks start as exact copies of the current meta policy.
            task_specific_policy=clone_meta_policy()
            meta_policy_net_copy=clone_meta_policy()
            task_specific_policy=task_specific_adaptation(task_specific_policy,meta_policy_net_copy,batch,q_values1,meta_lambda_now,index)

            after_batch,after_batch_extra,after_accumulated_raward_batch=sample_data_for_task_specific(dates,task_specific_policy,args.batch_size*5)
            print('(after adaptation) Episode {}\tAverage reward {:.2f}'.format(i_episode, after_accumulated_raward_batch))

            q_values_after = compute_adavatage(after_batch,after_batch_extra,args.batch_size*5)
            q_values_after = (q_values_after - q_values_after.mean())

            kl_phi_theta=kl_divergence(meta_policy_net,task_specific_policy,batch,index)

            _, policy_gradient_main_term= policy_gradient_obain(task_specific_policy,after_batch,q_values_after)

            loss_for_1term=loss_obain_new(task_specific_policy,meta_policy_net,batch,q_values1)

            # Solve  H x = g  by conjugate gradients, where H is the Hessian
            # of (kl_phi_theta + loss_for_1term / meta_lambda_now) with
            # respect to the task-policy parameters and g is the flattened
            # post-adaptation policy gradient. The closure returns H @ v.
            def d_theta_2_kl_phi_theta_loss_for_1term(v):
                grads = torch.autograd.grad(kl_phi_theta+loss_for_1term/meta_lambda_now, task_specific_policy.parameters(), create_graph=True,retain_graph=True)
                flat_grad_kl = torch.cat([grad.contiguous().view(-1) for grad in grads])
                kl_v = (flat_grad_kl * Variable(v)).sum()
                grads_new = torch.autograd.grad(kl_v, task_specific_policy.parameters(), create_graph=True,retain_graph=True)
                flat_grad_grad_kl = torch.cat([grad.contiguous().view(-1) for grad in grads_new]).data.clone()
                return flat_grad_grad_kl
            policy_gradient_main_term_flat=torch.cat([grad.contiguous().view(-1) for grad in policy_gradient_main_term]).data
            x = conjugate_gradients(d_theta_2_kl_phi_theta_loss_for_1term, policy_gradient_main_term_flat, 10)
            x_list.append(x.data)
            task_specific_policy_list.append(task_specific_policy)
            batch_list.append(batch)

        def get_loss(volatile=False):
            # Meta objective: average over tasks of -(d KL / d task-params) . x,
            # differentiable with respect to the meta-policy parameters.
            overall_loss=0.0

            for task_specific_policy, batch, x in zip(task_specific_policy_list, batch_list, x_list):
                kl_phi_theta_1=kl_divergence(meta_policy_net,task_specific_policy,batch,index)
                grads_1 = torch.autograd.grad(kl_phi_theta_1, task_specific_policy.parameters(), create_graph=True,retain_graph=True)
                flat_grad_kl_1 = torch.cat([grad.contiguous().view(-1) for grad in grads_1])
                kl_v_1 = -(flat_grad_kl_1 * x).sum()

                overall_loss=overall_loss+kl_v_1*1.0/args.task_batch_size

            return overall_loss

        # States of every sampled task (previously hard-coded to exactly
        # five tasks, which broke for any other --task-batch-size).
        states = torch.cat([torch.Tensor(np.array(task_batch.state)) for task_batch in batch_list],dim=0)
        mean101, log_std101, std101 = meta_policy_net(Variable(states))
        mean0 = mean101.clone().detach().data.double()
        log_std0 = log_std101.clone().detach().data.double()
        std0 = std101.clone().detach().data.double()

        def get_kl():
            # KL between the meta policy before the step (frozen outputs
            # above) and its current outputs, per state.
            mean1, log_std1, std1 = meta_policy_net(Variable(states))
            kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (2.0 * std1.pow(2)) - 0.5
            return kl.sum(1, keepdim=True)

        trpo_step(meta_policy_net, get_loss, get_kl, args.max_kl, args.damping)

        # ---- evaluation on a fixed set of turbulence thresholds ----
        turbulence_list000=[45,46,47,48,49]
        len_turbulence_list000=len(turbulence_list000)
        result_before=np.zeros(len_turbulence_list000)
        result_after=np.zeros(len_turbulence_list000)
        for task_number_test in range(len_turbulence_list000):
            turbulence=turbulence_list000[task_number_test]
            dates = turbulence_dates(turbulence)
            batch,batch_extra,accumulated_raward_batch=sample_data_for_task_specific(dates,meta_policy_net,args.batch_size)
            result_before[task_number_test]=accumulated_raward_batch

            q_values = compute_adavatage(batch,batch_extra,args.batch_size)
            q_values = (q_values - q_values.mean())

            task_specific_policy=clone_meta_policy()
            meta_policy_net_copy=clone_meta_policy()
            task_specific_policy=task_specific_adaptation(task_specific_policy,meta_policy_net_copy,batch,q_values,meta_lambda_now,index)

            after_batch,after_batch_extra,after_accumulated_raward_batch=sample_data_for_task_specific(dates,task_specific_policy,args.batch_size)
            result_after[task_number_test]=after_accumulated_raward_batch

        print('iteration',i_episode)
        print("result_before: ",result_before.mean())
        print("result_after: ",result_after.mean())

        # newline='' is what the csv module expects for files it writes.
        with open("./check_point/"+str(args.env_name)+"_training_log.csv", 'a+', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([i_episode, result_after.mean()])

        if result_after.mean()>best_result_after:
            print("save model")
            best_result_after=result_after.mean()
            torch.save(meta_policy_net, meta_policy_ckpt)
            # The context manager closes the file even if pickling fails.
            with open(running_state_ckpt, 'wb') as output_hal:
                pickle.dump(running_state, output_hal)

        print(torch.exp(meta_policy_net.action_log_std))

